/* Function atanf vectorized with SSE4.
   Copyright (C) 2021-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *      For    0.0    <= x <=  7.0/16.0: atan(x) = atan(0.0) + atan(s), where s=(x-0.0)/(1.0+0.0*x)
 *      For  7.0/16.0 <= x <= 11.0/16.0: atan(x) = atan(0.5) + atan(s), where s=(x-0.5)/(1.0+0.5*x)
 *      For 11.0/16.0 <= x <= 19.0/16.0: atan(x) = atan(1.0) + atan(s), where s=(x-1.0)/(1.0+1.0*x)
 *      For 19.0/16.0 <= x <= 39.0/16.0: atan(x) = atan(1.5) + atan(s), where s=(x-1.5)/(1.0+1.5*x)
 *      For 39.0/16.0 <= x <=    inf   : atan(x) = atan(inf) + atan(s), where s=-1.0/x
 *      Where atan(s) ~= s+s^3*Poly11(s^2) on interval |s|<7.0/16.0.
 *
 */

/* Byte offsets of each constant inside the __svml_satan_data_internal
   table defined at the end of this file.  Every constant occupies one
   16-byte row (the same 32-bit word repeated four times, one per vector
   lane), so consecutive offsets differ by 16.  The values here must
   stay in the same order as the rows emitted in the table.
 */
#define _sSIGN_MASK			0
#define _sABS_MASK			16
#define _sONE				32
#define _sPIO2				48
#define _sPC8				64
#define _sPC7				80
#define _sPC6				96
#define _sPC5				112
#define _sPC4				128
#define _sPC3				144
#define _sPC2				160
#define _sPC1				176
#define _sPC0				192

#include <sysdep.h>

	.section .text.sse4, "ax", @progbits
/*
 * _ZGVbN4v_atanf_sse4 -- atanf over four packed single-precision lanes.
 *
 * In:       %xmm0 = x (four floats)
 * Out:      %xmm0 = result (four floats)
 * Clobbers: %xmm1-%xmm10
 * Stack:    not used; straight-line leaf code, no calls and no branches.
 *
 * Method as implemented below (per lane):
 *   mask = (|x| <= 1.0)
 *   r    = mask ? x : -1/x          (computed as min(|x|,1)/max(|x|,1)
 *                                    with the sign patched in afterwards)
 *   base = mask ? 0 : Pi/2, carrying the sign of x
 *   P    = PC0 + PC1*r^2 + PC2*r^4 + ... + PC8*r^16
 *   result = base + r*P
 *
 * NOTE(review): this is a single reciprocal reduction around |x| = 1; it
 * does not use the multi-interval breakpoints listed in the file-level
 * ALGORITHM DESCRIPTION comment.
 */
ENTRY(_ZGVbN4v_atanf_sse4)
	/* xmm2 = ABS_MASK (becomes |x| after the andps below).  */
	movups	_sABS_MASK+__svml_satan_data_internal(%rip), %xmm2

	/*
	 * Argument reduction:
	 * 1) If x>1,      then r=-1/x, PIO2=Pi/2
	 * 2) If -1<=x<=1, then r=x,    PIO2=0
	 * 3) If x<-1,     then r=-1/x, PIO2=-Pi/2
	 */
	movups	_sONE+__svml_satan_data_internal(%rip), %xmm1
	/* xmm2 = |x|.  */
	andps	%xmm0, %xmm2
	movaps	%xmm2, %xmm9
	movaps	%xmm1, %xmm3
	/* xmm9 = all-ones in lanes where |x| <= 1.0, zero elsewhere.  */
	cmpleps	%xmm1, %xmm9
	/* xmm3 = max(1.0, |x|), xmm1 = min(1.0, |x|).  */
	maxps	%xmm2, %xmm3
	minps	%xmm2, %xmm1
	/* xmm1 = min/max: |x| when |x| <= 1, otherwise 1/|x|.  */
	divps	%xmm3, %xmm1
	/* xmm4 = SIGN_MASK.  */
	movups	__svml_satan_data_internal(%rip), %xmm4
	/* Keep a copy of the |x| <= 1 mask for selecting the Pi/2 term.  */
	movaps	%xmm9, %xmm10
	/* xmm0 = sign bit of x only.  */
	andps	%xmm4, %xmm0
	/* xmm9 = sign bit set in lanes where the mask is clear (the
	   reciprocal lanes), which supplies the negation in r = -1/x.  */
	andnps	%xmm4, %xmm9
	/* Fold in the sign of x, then the magnitude: xmm9 = r.  */
	pxor	%xmm0, %xmm9
	pxor	%xmm1, %xmm9

	/* Polynomial.  With r2 = r*r and r4 = r2*r2, the PC8/PC6/PC4/PC2
	   coefficients are accumulated in xmm6 and the PC7/PC5/PC3/PC1
	   coefficients in xmm5/xmm7, as two interleaved Horner chains in
	   r4 that are combined at the end.  */
	/* xmm8 = r2.  */
	movaps	%xmm9, %xmm8
	mulps	%xmm9, %xmm8
	/* xmm7 = r4.  */
	movaps	%xmm8, %xmm7
	mulps	%xmm8, %xmm7
	movups	_sPC8+__svml_satan_data_internal(%rip), %xmm6
	mulps	%xmm7, %xmm6
	movups	_sPC7+__svml_satan_data_internal(%rip), %xmm5
	mulps	%xmm7, %xmm5
	addps	_sPC6+__svml_satan_data_internal(%rip), %xmm6
	mulps	%xmm7, %xmm6
	addps	_sPC5+__svml_satan_data_internal(%rip), %xmm5
	mulps	%xmm7, %xmm5
	addps	_sPC4+__svml_satan_data_internal(%rip), %xmm6
	mulps	%xmm7, %xmm6
	addps	_sPC3+__svml_satan_data_internal(%rip), %xmm5
	/* xmm7 = r4 * ((PC7*r4 + PC5)*r4 + PC3); r4 itself is no longer
	   needed after this point.  */
	mulps	%xmm5, %xmm7
	addps	_sPC2+__svml_satan_data_internal(%rip), %xmm6
	/* xmm6 = r2 * (((PC8*r4 + PC6)*r4 + PC4)*r4 + PC2).  */
	mulps	%xmm8, %xmm6
	addps	_sPC1+__svml_satan_data_internal(%rip), %xmm7
	/* xmm10 = Pi/2 in lanes where |x| <= 1 is false, 0 elsewhere.  */
	andnps	_sPIO2+__svml_satan_data_internal(%rip), %xmm10
	/* xmm7 = PC1 + PC2*r2 + PC3*r4 + ... + PC8*r^14.  */
	addps	%xmm6, %xmm7
	mulps	%xmm7, %xmm8
	/* Give the Pi/2 (or zero) term the sign of x.  */
	pxor	%xmm0, %xmm10
	/* xmm8 = P = PC0 + PC1*r2 + ... + PC8*r^16.  */
	addps	_sPC0+__svml_satan_data_internal(%rip), %xmm8

	/* Reconstruction: result = signed base term + r*P.  */
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm10
	movaps	%xmm10, %xmm0
	ret

END(_ZGVbN4v_atanf_sse4)

	.section .rodata, "a"
	.align	16

/* Constant table read by _ZGVbN4v_atanf_sse4.  Each row is one 32-bit
   IEEE-754 single-precision bit pattern replicated across the four
   vector lanes and padded to a 16-byte boundary, so the row positions
   match the _s* offset macros at the top of this file.  Rows must not
   be reordered without updating those offsets.

   The typedef below is a C-style description of the same layout; it is
   only seen when __svml_satan_data_internal_typedef is defined, and
   nothing visible in this file defines it.  */
#ifdef __svml_satan_data_internal_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(16)) VUINT32 _sSIGN_MASK[4][1];
	__declspec(align(16)) VUINT32 _sABS_MASK[4][1];
	__declspec(align(16)) VUINT32 _sONE[4][1];
	__declspec(align(16)) VUINT32 _sPIO2[4][1];
	__declspec(align(16)) VUINT32 _sPC8[4][1];
	__declspec(align(16)) VUINT32 _sPC7[4][1];
	__declspec(align(16)) VUINT32 _sPC6[4][1];
	__declspec(align(16)) VUINT32 _sPC5[4][1];
	__declspec(align(16)) VUINT32 _sPC4[4][1];
	__declspec(align(16)) VUINT32 _sPC3[4][1];
	__declspec(align(16)) VUINT32 _sPC2[4][1];
	__declspec(align(16)) VUINT32 _sPC1[4][1];
	__declspec(align(16)) VUINT32 _sPC0[4][1];
} __svml_satan_data_internal;
#endif
__svml_satan_data_internal:
	/* Bit masks: sign bit only, and everything except the sign bit.  */
	.long	0x80000000, 0x80000000, 0x80000000, 0x80000000 // _sSIGN_MASK
	.align	16
	.long	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF // _sABS_MASK
	.align	16
	/* 1.0f: the threshold for the reciprocal reduction.  */
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 // _sONE
	.align	16
	/* Pi/2 rounded to single precision.  */
	.long	0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB, 0x3FC90FDB // _sPIO2
	.align	16
	/* Polynomial coefficients, highest order first.  PCk multiplies
	   r^(2k) in P(r^2); the result is formed as r*P(r^2).  */
	.long	0x3B322CC0, 0x3B322CC0, 0x3B322CC0, 0x3B322CC0 // _sPC8
	.align	16
	.long	0xBC7F2631, 0xBC7F2631, 0xBC7F2631, 0xBC7F2631 // _sPC7
	.align	16
	.long	0x3D2BC384, 0x3D2BC384, 0x3D2BC384, 0x3D2BC384 // _sPC6
	.align	16
	.long	0xBD987629, 0xBD987629, 0xBD987629, 0xBD987629 // _sPC5
	.align	16
	.long	0x3DD96474, 0x3DD96474, 0x3DD96474, 0x3DD96474 // _sPC4
	.align	16
	.long	0xBE1161F8, 0xBE1161F8, 0xBE1161F8, 0xBE1161F8 // _sPC3
	.align	16
	.long	0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F, 0x3E4CB79F // _sPC2
	.align	16
	/* Approximately -1/3.  */
	.long	0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49, 0xBEAAAA49 // _sPC1
	.align	16
	/* Exactly 1.0f, so r*P(r^2) reduces to r for very small r.  */
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000 // _sPC0
	.align	16
	.type	__svml_satan_data_internal, @object
	.size	__svml_satan_data_internal, .-__svml_satan_data_internal
